In your git repo for the assignment, include the Python file that scrapes the data, the Streamlit Python file, and your GIF.
Note: Only numbers 1, 4, and 5 are requested, per the class announcement.
import requests as rq
import bs4
import pandas as pd
import plotly.express as px
from io import StringIO
import plotly.io as pio
# Render plotly figures inline with the jupyterlab renderer.
pio.renderers.default = 'jupyterlab'
#code copied from the https://smart-stats.github.io/ds4bio_book/book/_build/html/webscraping.html with the exception of the url
url = 'https://en.m.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
page = rq.get(url)
# Peek at the start of the response to confirm we received HTML.
page.text[:99]
'<!DOCTYPE html>\n<html class="client-nojs mf-expand-sections-clientpref-0 mf-font-size-clientpref-sm'
#code copied from the https://smart-stats.github.io/ds4bio_book/book/_build/html/webscraping.html
# Parse the downloaded page and collect every table styled as a "wikitable";
# the first one is the GDP-by-country table we want.
bs4page = bs4.BeautifulSoup(page.text, 'html.parser')
tables = bs4page.find_all('table', {'class': "wikitable"})
#variables were changed to account that it is a GDP
GDP = pd.read_html(StringIO(str(tables[0])))[0]
# BUG FIX: the original line was `GDP = medals.dropna()`, but `medals` is never
# defined in this file (leftover from the tutorial's medals example) and would
# raise NameError — drop the NaN rows from GDP itself.
GDP = GDP.dropna()
GDP.head()
| Country/Territory | UN region | IMF[1][13] | World Bank[14] | United Nations[15] | ||||
|---|---|---|---|---|---|---|---|---|
| Country/Territory | UN region | Forecast | Year | Estimate | Year | Estimate | Year | |
| 0 | World | — | 104476432 | 2023 | 100562011 | 2022 | 96698005 | 2021 |
| 1 | United States | Americas | 26949643 | 2023 | 25462700 | 2022 | 23315081 | 2021 |
| 2 | China | Asia | 17700899 | [n 1]2023 | 17963171 | [n 3]2022 | 17734131 | [n 1]2021 |
| 3 | Germany | Europe | 4429838 | 2023 | 4072192 | 2022 | 4259935 | 2021 |
| 4 | Japan | Asia | 4230862 | 2023 | 4231141 | 2022 | 4940878 | 2021 |
#cleaning data: some cells contain "[n <number>]", the footnote marker the wiki
# page uses to point to a reference — strip it everywhere.
# using regex learned from https://www.w3schools.com/python/python_regex.asp
for col in GDP.columns:
    # FIX: the .str accessor only exists on object (string) columns; guard on
    # dtype so a column pandas already parsed as numeric cannot raise
    # AttributeError and abort the cleanup loop.
    if GDP[col].dtype == object:
        GDP[col] = GDP[col].str.replace(r'\[n \d+\]', '', regex=True)
GDP.head()
| Country/Territory | UN region | IMF[1][13] | World Bank[14] | United Nations[15] | ||||
|---|---|---|---|---|---|---|---|---|
| Country/Territory | UN region | Forecast | Year | Estimate | Year | Estimate | Year | |
| 0 | World | — | 104476432 | 2023 | 100562011 | 2022 | 96698005 | 2021 |
| 1 | United States | Americas | 26949643 | 2023 | 25462700 | 2022 | 23315081 | 2021 |
| 2 | China | Asia | 17700899 | 2023 | 17963171 | 2022 | 17734131 | 2021 |
| 3 | Germany | Europe | 4429838 | 2023 | 4072192 | 2022 | 4259935 | 2021 |
| 4 | Japan | Asia | 4230862 | 2023 | 4231141 | 2022 | 4940878 | 2021 |
# Rename the top-level header columns in a single call, dropping the Wikipedia
# citation markers ([1][13], [14], [15]) that survived the cell-level cleanup.
GDP = GDP.rename(columns={
    'IMF[1][13]': 'IMF',
    'World Bank[14]': 'World Bank',
    'United Nations[15]': 'United Nations',
})
GDP.head()
| Country/Territory | UN region | IMF | World Bank | United Nations | ||||
|---|---|---|---|---|---|---|---|---|
| Country/Territory | UN region | Forecast | Year | Estimate | Year | Estimate | Year | |
| 0 | World | — | 104476432 | 2023 | 100562011 | 2022 | 96698005 | 2021 |
| 1 | United States | Americas | 26949643 | 2023 | 25462700 | 2022 | 23315081 | 2021 |
| 2 | China | Asia | 17700899 | 2023 | 17963171 | 2022 | 17734131 | 2021 |
| 3 | Germany | Europe | 4429838 | 2023 | 4072192 | 2022 | 4259935 | 2021 |
| 4 | Japan | Asia | 4230862 | 2023 | 4231141 | 2022 | 4940878 | 2021 |
#Now the data looks good. Double checking that the column of interest is numeric
from pandas.api.types import is_numeric_dtype
# Select the (IMF, Forecast) column of the MultiIndex header with a tuple key.
is_numeric_dtype(GDP[('IMF', 'Forecast')])
False
#The data does not seem numeric, so convert the IMF forecast column.
# Non-numeric cells (e.g. the em-dash placeholders) become NaN via coercion.
forecast_numeric = pd.to_numeric(GDP.loc[:, ("IMF", "Forecast")], errors="coerce")
GDP.loc[:, ("IMF", "Forecast")] = forecast_numeric
GDP.head()
| Country/Territory | UN region | IMF | World Bank | United Nations | ||||
|---|---|---|---|---|---|---|---|---|
| Country/Territory | UN region | Forecast | Year | Estimate | Year | Estimate | Year | |
| 0 | World | — | 104476432.0 | 2023 | 100562011 | 2022 | 96698005 | 2021 |
| 1 | United States | Americas | 26949643.0 | 2023 | 25462700 | 2022 | 23315081 | 2021 |
| 2 | China | Asia | 17700899.0 | 2023 | 17963171 | 2022 | 17734131 | 2021 |
| 3 | Germany | Europe | 4429838.0 | 2023 | 4072192 | 2022 | 4259935 | 2021 |
| 4 | Japan | Asia | 4230862.0 | 2023 | 4231141 | 2022 | 4940878 | 2021 |
#We will check which columns contain NaN values
has_nan = GDP.isna().any()
GDP.loc[:, has_nan]
| IMF | |
|---|---|
| Forecast | |
| 0 | 104476432.0 |
| 1 | 26949643.0 |
| 2 | 17700899.0 |
| 3 | 4429838.0 |
| 4 | 4230862.0 |
| ... | ... |
| 209 | 267.0 |
| 210 | 246.0 |
| 211 | 150.0 |
| 212 | NaN |
| 213 | 63.0 |
214 rows × 1 columns
#Found some. Therefore, we will drop NaN from the data for simplifying the analysis
GDP = GDP.dropna()
#dropping the row with the name "World" in Country/Territory as we only need countries.
# FIX: the original used positional slicing (GDP.iloc[1:, :]), which silently
# depends on the "World" aggregate being the first row; select by value so the
# code keeps working if the table ordering ever changes.
GDP = GDP[GDP[('Country/Territory', 'Country/Territory')] != 'World']
#checking success
GDP
| Country/Territory | UN region | IMF | World Bank | United Nations | ||||
|---|---|---|---|---|---|---|---|---|
| Country/Territory | UN region | Forecast | Year | Estimate | Year | Estimate | Year | |
| 1 | United States | Americas | 26949643.0 | 2023 | 25462700 | 2022 | 23315081 | 2021 |
| 2 | China | Asia | 17700899.0 | 2023 | 17963171 | 2022 | 17734131 | 2021 |
| 3 | Germany | Europe | 4429838.0 | 2023 | 4072192 | 2022 | 4259935 | 2021 |
| 4 | Japan | Asia | 4230862.0 | 2023 | 4231141 | 2022 | 4940878 | 2021 |
| 5 | India | Asia | 3732224.0 | 2023 | 3385090 | 2022 | 3201471 | 2021 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 208 | Marshall Islands | Oceania | 277.0 | 2023 | 280 | 2022 | 257 | 2021 |
| 209 | Palau | Oceania | 267.0 | 2023 | — | — | 218 | 2021 |
| 210 | Kiribati | Oceania | 246.0 | 2023 | 223 | 2022 | 227 | 2021 |
| 211 | Nauru | Oceania | 150.0 | 2023 | 151 | 2022 | 155 | 2021 |
| 213 | Tuvalu | Oceania | 63.0 | 2023 | 60 | 2022 | 60 | 2021 |
191 rows × 8 columns
#We only need the country, region, and IMF columns. Since GDP has a multilevel
# (two-row) header, pull just those columns into a new dataframe GDP_IMF and
# flatten it to a single header level for further analysis.
wanted_columns = [
    ('Country/Territory', 'Country/Territory'),
    ('UN region', 'UN region'),
    ('IMF', 'Forecast'),
    ('IMF', 'Year'),
]
GDP_IMF = GDP[wanted_columns].droplevel(0, axis=1)
GDP_IMF.columns = ['Country/Territory', 'UN region', 'IMF Forecast', 'IMF Year']
#checking that we now have a single-level dataframe
GDP_IMF
| Country/Territory | UN region | IMF Forecast | IMF Year | |
|---|---|---|---|---|
| 1 | United States | Americas | 26949643.0 | 2023 |
| 2 | China | Asia | 17700899.0 | 2023 |
| 3 | Germany | Europe | 4429838.0 | 2023 |
| 4 | Japan | Asia | 4230862.0 | 2023 |
| 5 | India | Asia | 3732224.0 | 2023 |
| ... | ... | ... | ... | ... |
| 208 | Marshall Islands | Oceania | 277.0 | 2023 |
| 209 | Palau | Oceania | 267.0 | 2023 |
| 210 | Kiribati | Oceania | 246.0 | 2023 |
| 211 | Nauru | Oceania | 150.0 | 2023 |
| 213 | Tuvalu | Oceania | 63.0 | 2023 |
191 rows × 4 columns
#Stacked bar chart of the IMF numbers with plotly: x-axis is the UN region,
# y-axis is the GDP forecast, and each country is a colored segment.
fig = px.bar(
    GDP_IMF,
    x='UN region',
    y='IMF Forecast',
    color='Country/Territory',
    barmode='stack',
)
#Adjusting the figure size
fig.update_layout(width=1200, height=800)
fig.show()
html = fig.to_html(full_html=False)
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.io as pio

pio.renderers.default = 'jupyterlab'
#this portion of the code was copied from https://smart-stats.github.io/ds4bio_book/book/_build/html/interactive.html per the directive of the assignment. Nothing was altered here.
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
# Read the MRIcloud lookup table, drop the unused Level5 column, and give the
# generic "modify" columns meaningful hierarchy-level names.
multilevel_lookup = (
    pd.read_csv(url, sep="\t")
    .drop(columns=['Level5'])
    .rename(columns={
        "modify": "roi",
        "modify.1": "level4",
        "modify.2": "level3",
        "modify.3": "level2",
        "modify.4": "level1",
    })
)
# Keep the columns in roi -> level4 -> ... -> level1 order.
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3', 'level2', 'level1']]
multilevel_lookup.head()
# FIX: the original variable was named `id`, which shadows the Python builtin
# id(); use subject_id instead (nothing later in the file reads `id`).
subject_id = 127
subjectData = pd.read_csv("kirby21AllLevels.csv")
# Keep only the type-1, level-5 rows for this subject.
subjectData = subjectData.loc[(subjectData.type == 1) & (subjectData.level == 5) & (subjectData.id == subject_id)]
subjectData = subjectData[['roi', 'volume']]
## Merge the subject data with the multilevel data
subjectData = pd.merge(subjectData, multilevel_lookup, on = "roi")
# Add a constant "ICV" label (root node for the Sankey diagrams) and each
# region's volume as a fraction of the subject's total volume.
subjectData = subjectData.assign(icv = "ICV")
subjectData = subjectData.assign(comp = subjectData.volume / np.sum(subjectData.volume))
subjectData.head()
| roi | volume | level4 | level3 | level2 | level1 | icv | comp | |
|---|---|---|---|---|---|---|---|---|
| 0 | SFG_L | 12926 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009350 |
| 1 | SFG_R | 10050 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.007270 |
| 2 | SFG_PFC_L | 12783 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009247 |
| 3 | SFG_PFC_R | 11507 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.008324 |
| 4 | SFG_pole_L | 3078 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.002227 |
# Sankey diagram per the assignment: display this subject's data starting from
# the intracranial volume (ICV).
# NOTE(review): despite the "level 1 only" comment below, value_vars includes
# level1-level4 and every link's source is ICV, so this draws ICV -> every
# label at every level, not a true hierarchy — confirm this is intended.
import plotly.graph_objects as go
# Melt to long format: one row per (level, region-label) pair, carrying the
# ICV source column and the comp (fractional volume) used as the link weight.
subjectData_Sanskey = subjectData.melt(id_vars = ["icv", "comp"], value_vars = ["level1", "level2", "level3", "level4"], var_name = "level", value_name = "target")
# Node labels: the single ICV source followed by every distinct target label.
nodes = list(subjectData_Sanskey["icv"].unique()) + list(subjectData_Sanskey["target"].unique())
# Map each label to its integer index — go.Sankey links reference nodes by index.
node_dictionary = {node: i for i, node in enumerate(nodes)}
# Links: source is always ICV, target is the melted label, value is comp.
source = [node_dictionary[node] for node in subjectData_Sanskey["icv"]]
target = [node_dictionary[node] for node in subjectData_Sanskey["target"]]
value = list(subjectData_Sanskey["comp"])
# Build the Sankey figure; hovertemplate shows source/target labels and comp.
fig = go.Figure(data = go.Sankey(
    node = dict(
        label = nodes
    ),
    link = dict(
        source = source,
        target = target,
        value = value,
        hovertemplate='Source: %{source.label}<br>'
        'Target: %{target.label}<br>'
        'Comp: %{value}<extra></extra>'
    )
))
# Title and window size.
fig.update_layout(title_text = "MRI cloudData subject id 127 Sanskey diagram")
fig.update_layout(width=1200, height=2000)
fig.show()
# Sankey diagram connecting the hierarchy ICV -> level1 -> level2 -> level3.
#First we will sort the subjectData by volume for ease, in case we need to debug
subjectData = subjectData.sort_values(by='volume')
# FIX: build the node label list from *unique* labels (dict.fromkeys keeps
# first-seen order). The original concatenated the raw columns, so `nodes`
# held one duplicate entry per row while node_dictionary only kept the last
# index of each label — leaving many duplicate, unconnected nodes in the figure.
nodes = list(dict.fromkeys(
    list(subjectData['icv']) + list(subjectData['level1'])
    + list(subjectData['level2']) + list(subjectData['level3'])
))
#creating a dictionary for the nodes for association (links use integer indices)
node_dictionary = {node: i for i, node in enumerate(nodes)}
# Link lists populated by the loop below.
source = []
target = []
value = []
# For every row add one link per hierarchy step (ICV->level1, level1->level2,
# level2->level3), each weighted by the row's comp (fractional volume).
# FIX: iterate over *all* rows — the original used range(len(subjectData) - 1),
# which silently dropped the last region.
for i in range(len(subjectData)):
    row = subjectData.iloc[i]
    source.append(node_dictionary[row['icv']])
    target.append(node_dictionary[row['level1']])
    value.append(row['comp'])
    source.append(node_dictionary[row['level1']])
    target.append(node_dictionary[row['level2']])
    value.append(row['comp'])
    source.append(node_dictionary[row['level2']])
    target.append(node_dictionary[row['level3']])
    value.append(row['comp'])
# Build the Sankey figure for the level 1-3 hierarchy starting at ICV.
fig = go.Figure(data=go.Sankey(
    node=dict(
        label=nodes
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        hovertemplate='Source: %{source.label}<br>'
        'Target: %{target.label}<br>'
        'Comp: %{value}<extra></extra>'
    )
))
#title and adjusting the window size
fig.update_layout(title_text="MRI cloudData subject id 127 Sanskey diagram")
fig.update_layout(width=1200, height=2000)
fig.show()
## Sankey diagram encompassing all levels, ICV -> level1 -> ... -> level4.
# FIX: use *unique* node labels (dict.fromkeys keeps first-seen order) — the
# original concatenated raw columns, producing a label list full of duplicates
# whose indices never matched node_dictionary's last-occurrence mapping.
nodes = list(dict.fromkeys(
    list(subjectData['icv']) + list(subjectData['level1'])
    + list(subjectData['level2']) + list(subjectData['level3'])
    + list(subjectData['level4'])
))
# Dictionary mapping each label to its node index for link construction.
node_dictionary = {node: i for i, node in enumerate(nodes)}
# Link lists populated by the loop below.
source = []
target = []
value = []
# For every row add one link per hierarchy step, weighted by comp.
# FIX: iterate over *all* rows — range(len(subjectData) - 1) in the original
# silently dropped the last region.
for i in range(len(subjectData)):
    row = subjectData.iloc[i]
    source.append(node_dictionary[row['icv']])
    target.append(node_dictionary[row['level1']])
    value.append(row['comp'])
    source.append(node_dictionary[row['level1']])
    target.append(node_dictionary[row['level2']])
    value.append(row['comp'])
    source.append(node_dictionary[row['level2']])
    target.append(node_dictionary[row['level3']])
    value.append(row['comp'])
    source.append(node_dictionary[row['level3']])
    target.append(node_dictionary[row['level4']])
    value.append(row['comp'])
# Build the Sankey figure for the level 1-4 hierarchy starting at ICV.
fig = go.Figure(data=go.Sankey(
    node=dict(
        label=nodes
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        hovertemplate='Source: %{source.label}<br>'
        'Target: %{target.label}<br>'
        'Value: %{value}<extra></extra>'
    )
))
#title and adjusting the window size
fig.update_layout(title_text="MRI cloudData subject id 127 Sanskey diagram")
fig.update_layout(width=1200, height=3000)
fig.show()
# Experimental "geometrical" visual, just for fun: instead of a hierarchy, each
# melted row's label is chained to the next row's label.
# Melt to long format again, this time with levels 1-3 only.
subjectData_Sanskey = subjectData.melt(id_vars=["icv", "comp"], value_vars=["level1", "level2", "level3"],
var_name="level", value_name="target")
# Node labels: ICV plus every distinct target label.
nodes = list(subjectData_Sanskey["icv"].unique()) + list(subjectData_Sanskey["target"].unique())
# Map each unique label to its node index for link construction.
node_dictionary = {node: i for i, node in enumerate(nodes)}
# Link lists; the values are taken directly from comp.
# NOTE(review): value has len(subjectData_Sanskey) entries while the loop below
# yields only len-1 source/target pairs, so the link lists are misaligned by
# one — confirm whether this is the intended "for fun" effect.
source = []
target = []
value = list(subjectData_Sanskey["comp"])
# Chain row i-1's label to row i's label, regardless of hierarchy level.
for i in range(1, len(subjectData_Sanskey)):
    source.append(node_dictionary[subjectData_Sanskey.iloc[i - 1]["target"]])
    target.append(node_dictionary[subjectData_Sanskey.iloc[i]["target"]])
# Build the chained Sankey figure.
fig = go.Figure(data=go.Sankey(
    node=dict(
        label=nodes
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        hovertemplate='Source: %{source.label}<br>'
        'Target: %{target.label}<br>'
        'Value: %{value}<extra></extra>'
    )
))
# Title (NOTE(review): the title string has an unmatched "(" — left untouched
# here because runtime strings are out of scope for a comment-only edit) and size.
fig.update_layout(title_text="MRI cloudData subject id 127 Sanskey diagram (geometrical")
fig.update_layout(width=2000, height=1500)
fig.show()